105 lines (104 with data), 2.5 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"# Source directory whose entries get re-distributed, and the parent\n",
"# directory that receives the new sub-folders.\n",
"# NOTE(review): these are machine-specific absolute paths; adjust before\n",
"# running anywhere else.\n",
"src = \"/home/serge/database/data/genomes/bacillus/ncbi-genomes-2019-06-25/train/\"\n",
"t = \"/home/serge/database/data/genomes/bacillus/ncbi-genomes-2019-06-25\"\n",
"\n",
"# os.listdir returns names in arbitrary order; sorting keeps the chunking\n",
"# and the seeded split further down reproducible from run to run.\n",
"src_files = sorted(os.listdir(src))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3966"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(src_files)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"num = 500  # number of directory entries per chunk folder\n",
"\n",
"for i, file_name in enumerate(src_files):\n",
"    if i % num == 0:\n",
"        # Start a new chunk folder named after the index of its first entry\n",
"        # (0, 500, 1000, ...).\n",
"        dest = os.path.join(t, str(i))\n",
"        # exist_ok makes the cell safe to re-run without a separate exists() check.\n",
"        os.makedirs(dest, exist_ok=True)\n",
"    full_file_name = os.path.join(src, file_name)\n",
"    # Only regular files are copied; other entries still advance the counter.\n",
"    if os.path.isfile(full_file_name):\n",
"        shutil.copy(full_file_name, dest)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the directory entries for validation; the fixed seed keeps\n",
"# the split repeatable for a given ordering of src_files.\n",
"train, test = train_test_split(src_files, test_size=0.2, random_state=42)\n",
"portions = {\"train\": train, \"valid\": test}\n",
"\n",
"for part, file_names in portions.items():\n",
"    dest = os.path.join(t, part)\n",
"    # Create the destination once per portion instead of re-checking per file.\n",
"    os.makedirs(dest, exist_ok=True)\n",
"    for file_name in file_names:\n",
"        full_file_name = os.path.join(src, file_name)\n",
"        if not os.path.isfile(full_file_name):\n",
"            continue  # only regular files are copied\n",
"        target = os.path.join(dest, file_name)\n",
"        # NOTE(review): src as set in the first cell is t + \"/train/\", so the\n",
"        # \"train\" destination is the source directory itself and shutil.copy\n",
"        # would raise SameFileError. Such files are already in place: skip them.\n",
"        if os.path.exists(target) and os.path.samefile(full_file_name, target):\n",
"            continue\n",
"        # NOTE(review): files are copied, not moved, so the validation files\n",
"        # also remain in src -- confirm that is intended.\n",
"        shutil.copy(full_file_name, dest)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:bio] *",
"language": "python",
"name": "conda-env-bio-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}